library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.8
## v tidyr   0.8.2     v stringr 1.3.1
## v readr   1.2.1     v forcats 0.3.0
## -- Conflicts ----------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(imager)
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
## 
##     add
## The following object is masked from 'package:stringr':
## 
##     boundary
## The following object is masked from 'package:tidyr':
## 
##     fill
## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum
## The following object is masked from 'package:graphics':
## 
##     frame
## The following object is masked from 'package:base':
## 
##     save.image
library(glue)
## 
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
## 
##     collapse
library(ggthemes)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(radiant.data)
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
## 
## Attaching package: 'radiant.data'
## The following objects are masked from 'package:lubridate':
## 
##     month, wday
## The following object is masked from 'package:forcats':
## 
##     as_factor
## The following objects are masked from 'package:purrr':
## 
##     is_double, is_empty, is_numeric
## The following object is masked from 'package:ggplot2':
## 
##     diamonds
library(corrplot)
## corrplot 0.84 loaded
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Root directory of the AVA dataset; all file paths below are built from it.
DATASET.PATH <- "../data/ava_downloader/AVA_dataset"

# Read train.csv from the dataset directory.
train_df <- glue("{DATASET.PATH}/train.csv") %>%
  read_csv()
## Parsed with column specification:
## cols(
##   .default = col_double()
## )
## See spec(...) for full column specifications.
# Read test.csv from the dataset directory.
test_df <- glue("{DATASET.PATH}/test.csv") %>%
  read_csv()
## Parsed with column specification:
## cols(
##   .default = col_double()
## )
## See spec(...) for full column specifications.
# Overlap check between the two splits: rows whose image.id occurs in both
# train_df and test_df. Note that, despite the variable name, this is an
# inner join, so an empty result means the splits share no image.
anti <- inner_join(train_df, test_df, by = "image.id")
anti
# The same check on the bare id vectors.
intersect(train_df$image.id, test_df$image.id)
## numeric(0)
# Plot the first `n` images referenced by `df` in a four-column grid.
#
# Arguments:
#   n  - number of images to draw; capped at nrow(df).
#   df - data frame with an `image.id` column. Each image is loaded from
#        "{DATASET.PATH}/images/{image.id}.jpg", in row order.
#
# Returns NULL invisibly; the function is called for its plotting side effect.
# The device is reset to a single-panel layout when the function exits.
show.images <- function(n, df) {
  # Never index past the end of df: that would produce NA ids and a failed load.
  n <- min(n, nrow(df))
  if (n < 1) {
    return(invisible(NULL))
  }

  n.col <- 4
  n.row <- ceiling(n / n.col)
  # Pad the layout matrix with 0 (an unused cell) so that n does not have to
  # be a multiple of the column count.
  cells <- c(seq_len(n), rep(0, n.row * n.col - n))
  layout(matrix(cells, ncol = n.col, byrow = TRUE))
  # Do not leak the multi-panel layout into later plots.
  on.exit(layout(1), add = TRUE)

  for (i in seq_len(n)) {
    image.id <- df$image.id[i]
    plot(load.image(glue("{DATASET.PATH}/images/{image.id}.jpg")))
  }

  invisible(NULL)
}
# Per-file image attributes. The integer image id is recovered from the file
# name by dropping its last four characters (presumably the ".jpg" extension --
# confirm), and the pixel count width * height is stored as `resolution`.
files <- glue("{DATASET.PATH}/image_attributes.csv") %>%
  read_csv() %>%
  mutate(
    image.id = as.integer(str_sub(filename, 0, -5)),
    resolution = width * height
  ) %>%
  select(-filename)
## Parsed with column specification:
## cols(
##   aspect_ratio = col_double(),
##   depth = col_double(),
##   file_size = col_double(),
##   filename = col_character(),
##   height = col_double(),
##   img_size = col_double(),
##   width = col_double()
## )
# Raw AVA metadata: space-delimited with no header row, so columns come back
# as X1..X15. read_delim() already returns a tibble, so no conversion is needed.
ava.image.dataset <- read_delim(
  glue("{DATASET.PATH}/AVA.txt"),
  delim = " ",
  col_names = FALSE
)
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   X2 = col_double(),
##   X3 = col_double(),
##   X4 = col_double(),
##   X5 = col_double(),
##   X6 = col_double(),
##   X7 = col_double(),
##   X8 = col_double(),
##   X9 = col_double(),
##   X10 = col_double(),
##   X11 = col_double(),
##   X12 = col_double(),
##   X13 = col_double(),
##   X14 = col_double(),
##   X15 = col_double()
## )
# Name the 15 positional columns: a row index, the image id, ten columns
# "1".."10" (these look like per-score vote counts -- confirm against the AVA
# readme), two semantic tag ids and a challenge id.
colnames(ava.image.dataset) <- c(
  "index", "image.id",
  as.character(1:10),
  "semantic.tag.id1", "semantic.tag.id2", "challenge.id"
)

# Drop the row index and store image.id as an integer key.
ava.image.dataset <- ava.image.dataset %>%
  select(-index) %>%
  mutate(image.id = as.integer(image.id))


# Add per-image rating statistics.
#
# The score columns "1".."10" are reshaped to long form (rating, number) and
# `number` is used as the weight of each rating, giving a weighted mean and a
# weighted standard deviation per image. The result is joined back on image.id.
#
# rating.mean.bucket puts the mean into unit-wide intervals labelled by their
# lower bound: bucket k covers (k, k + 1]. include.lowest = TRUE closes the
# first interval on the left, so a mean of exactly 1 falls into bucket 1
# instead of becoming NA.
ava.image.dataset <- ava.image.dataset %>%
  inner_join(
    ava.image.dataset %>%
      select(-semantic.tag.id1, -semantic.tag.id2, -challenge.id) %>%
      gather(-image.id, key = "rating", value = "number") %>%
      group_by(image.id) %>%
      summarise(
        rating.mean = weighted.mean(as.numeric(rating), number),
        rating.sd = weighted.sd(as.numeric(rating), number)
      ) %>%
      mutate(
        rating.mean.bucket = cut(
          rating.mean,
          breaks = 1:10,
          labels = 1:9,
          include.lowest = TRUE
        )
      ),
    by = c("image.id")
  )

Overview

Check for missing files

There are 255508 image files, but 255530 images are listed in the dataset.

Data entries without an image file were deleted.

Summary

The dataset has 255530 rows and 17 columns.

Columns: image.id, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, semantic.tag.id1, semantic.tag.id2, challenge.id, rating.mean, rating.sd, rating.mean.bucket

Sample rows

# Show the rows with the highest image ids.
head(arrange(ava.image.dataset, desc(image.id)))

# Attach the file attributes. No `by` is given, so the join key is the set of
# shared column names (reported in the message below). Being an inner join,
# rows without a matching entry in `files` are dropped.
ava.image.dataset <- inner_join(ava.image.dataset, files)
## Joining, by = "image.id"
# The attribute table is now part of ava.image.dataset; free it.
rm(files)

Univariate Analysis

# Histogram of the total number of votes per image (sum over all ten score
# columns).
#
# The score columns are selected by NAME using backticks. Bare numbers inside
# select() are column positions, not names: position 1 is image.id itself and
# positions 2..10 are the columns "1".."9", so the "10" column would be
# silently left out of the sum.
plot.dist.num.ratings <- ava.image.dataset %>%
  select(image.id, `1`:`10`) %>%
  gather(-image.id, key = "rating", value = "number") %>%
  group_by(image.id) %>%
  summarize(number = sum(number)) %>%
  ggplot() +
  geom_histogram(aes(number), fill = "steelblue", bins = 50)
# Histogram of the mean rating per image.
plot.dist.rating.mean <- ggplot(ava.image.dataset) +
  geom_histogram(aes(rating.mean), fill = "steelblue", bins = 70)

# Number of images per mean-rating bucket, with a horizontal reference line
# at a count of 1000.
bucket.counts <- function(df) {
  df %>%
    group_by(rating.mean.bucket) %>%
    summarise(n = n())
}
plot.bar.rating.mean.bucket <- ggplot(bucket.counts(ava.image.dataset)) +
  geom_bar(aes(rating.mean.bucket, n), fill = "steelblue", stat = "identity") +
  scale_y_continuous(labels = comma) +
  geom_hline(yintercept = 1000)
rm(bucket.counts)
# One 30-bin histogram per image attribute, plus one for the rating spread.
# They are only built here; they are drawn later via grid.arrange().

# Aspect ratio, with x-axis ticks every 0.25 between 0 and 5.
plot.dist.aspect.ratio <- ggplot(ava.image.dataset) +
  geom_histogram(aes(aspect_ratio), fill = "steelblue", bins = 30) +
  scale_x_continuous(breaks = seq(0, 5, 0.25))

# Resolution (width * height).
plot.dist.resolution <- ggplot(ava.image.dataset) +
  geom_histogram(aes(resolution), fill = "steelblue", bins = 30)

# File size.
plot.dist.file.size <- ggplot(ava.image.dataset) +
  geom_histogram(aes(file_size), fill = "steelblue", bins = 30)

# Image size as reported in the attribute file.
plot.dist.img.size <- ggplot(ava.image.dataset) +
  geom_histogram(aes(img_size), fill = "steelblue", bins = 30)

# Width.
plot.dist.width <- ggplot(ava.image.dataset) +
  geom_histogram(aes(width), fill = "steelblue", bins = 30)

# Height.
plot.dist.height <- ggplot(ava.image.dataset) +
  geom_histogram(aes(height), fill = "steelblue", bins = 30)

# Depth.
plot.dist.depth <- ggplot(ava.image.dataset) +
  geom_histogram(aes(depth), fill = "steelblue", bins = 30)

# Weighted standard deviation of the ratings per image.
plot.dist.rating.sd <- ggplot(ava.image.dataset) +
  geom_histogram(aes(rating.sd), fill = "steelblue", bins = 30)
# Rating-related distributions, two plots per row.
grid.arrange(
  plot.dist.rating.mean,
  plot.dist.rating.sd,
  plot.dist.num.ratings,
  plot.bar.rating.mean.bucket,
  ncol = 2
)

# Image-attribute distributions, three plots per row.
grid.arrange(
  plot.dist.aspect.ratio,
  plot.dist.file.size,
  plot.dist.img.size,
  plot.dist.width,
  plot.dist.height,
  plot.dist.depth,
  plot.dist.resolution,
  ncol = 3
)

Best rated Images

# The eight images with the highest mean rating.
ava.image.dataset %>%
  arrange(desc(rating.mean)) %>%
  show.images(8, .)

Worst rated Images

# The eight images with the lowest mean rating.
ava.image.dataset %>%
  arrange(rating.mean) %>%
  show.images(8, .)

Bivariate Analysis

# Pairwise correlations between the rating statistics and the numeric image
# attributes, drawn as a pie-style correlation matrix.
corrplot(
  ava.image.dataset %>%
    select(
      rating.mean, rating.sd,
      file_size, width, height, img_size, aspect_ratio, resolution
    ) %>%
    cor(),
  method = "pie"
)

# Scatter plots of the mean rating (x) against individual image attributes (y).
# Points are semi-transparent so that dense regions remain readable.

# Mean rating vs. file size.
corr.rating.mean.file.size <-
  ggplot(ava.image.dataset, aes(x = rating.mean, y = file_size)) +
  geom_point(color = "steelblue", alpha = 0.4)

# Mean rating vs. image size.
corr.rating.mean.img.size <-
  ggplot(ava.image.dataset, aes(x = rating.mean, y = img_size)) +
  geom_point(color = "steelblue", alpha = 0.4)

# Mean rating vs. aspect ratio.
corr.rating.mean.aspect.ratio <-
  ggplot(ava.image.dataset, aes(x = rating.mean, y = aspect_ratio)) +
  geom_point(color = "steelblue", alpha = 0.4)

# Mean rating vs. resolution.
corr.rating.mean.resolution <-
  ggplot(ava.image.dataset, aes(x = rating.mean, y = resolution)) +
  geom_point(color = "steelblue", alpha = 0.4)

# All four scatter plots on one page, two per row.
grid.arrange(
  corr.rating.mean.file.size,
  corr.rating.mean.img.size,
  corr.rating.mean.aspect.ratio,
  corr.rating.mean.resolution,
  ncol = 2
)

Extraction of data for Model Iteration 1

# Class sizes before merging.
table(ava.image.dataset$rating.mean.bucket)
## 
##      1      2      3      4      5      6      7      8      9 
##      6    540   7542  66582 131163  46430   3201     44      0
# Build a second bucket column in which the sparsely populated buckets are
# folded into their neighbours: 1 and 2 become 3, and 8 becomes 7. The factor
# levels themselves are kept, so the emptied buckets still appear (with count
# 0) in the table below.
ava.image.dataset <- ava.image.dataset %>%
  mutate(
    rating.mean.bucket2 = rating.mean.bucket,
    rating.mean.bucket2 = replace(
      rating.mean.bucket2, rating.mean.bucket2 == 2, 3
    ),
    rating.mean.bucket2 = replace(
      rating.mean.bucket2, rating.mean.bucket2 == 1, 3
    ),
    rating.mean.bucket2 = replace(
      rating.mean.bucket2, rating.mean.bucket2 == 8, 7
    )
  )
# Class sizes after merging.
table(ava.image.dataset$rating.mean.bucket2)
## 
##      1      2      3      4      5      6      7      8      9 
##      0      0   8088  66582 131163  46430   3245      0      0
# Order the rows from lowest to highest mean rating.
ava.image.dataset <- arrange(ava.image.dataset, rating.mean)
# The export is currently disabled; un-comment the next line to write the
# table to images_meta.csv inside the dataset directory.
#write.csv(ava.image.dataset, glue("{DATASET.PATH}/images_meta.csv"))